{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0_a2jpY-JpY9"
   },
   "outputs": [],
   "source": [
    "!pip install -U tomotopy &> /dev/null # topic modeling library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "display(HTML(\"<style>.container { width:85% !important; }</style>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8sSKmD6iJpZH",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "import tomotopy as tp\n",
    "import collections\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.colors as clr\n",
    "import matplotlib.font_manager as fm\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "pd.set_option('display.max_colwidth', -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('c:/pythonwork/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DF = pd.read_csv('c:/pythonwork/ptm_twitter.csv', encoding='utf-8')\n",
    "\n",
    "DF['All_out'].replace('', np.nan, inplace=True) \n",
    "DF.dropna(subset=['All_out'], inplace=True)\n",
    "DF['All_out'] = DF['All_out'].str.replace(' +', ' ')\n",
    "\n",
    "print(len(DF))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "73dAdGH0JpZN"
   },
   "outputs": [],
   "source": [
    "def compute_coherence_values(doc, start, limit, step):\n",
    "    for i in range(start, limit, step):\n",
    "        mdl = tp.PTModel(k=i, seed=7777)\n",
    "        mdl.burn_in = 100\n",
    "        \n",
    "        for text in doc:\n",
    "            token = text.strip().split()\n",
    "            if token:\n",
    "                mdl.add_doc(token)\n",
    "                \n",
    "        mdl.train(iter=500, workers=0)\n",
    "        coh = tp.coherence.Coherence(mdl, coherence='c_v')\n",
    "        coherence_per_topic = coh.get_score()\n",
    "        perplexity_per_topic = mdl.perplexity\n",
    "\n",
    "        print('Topic: {}\\tLog-likelihood: {}\\tPerplexity: {}\\tCoherence: {}'\\\n",
    "        .format(\n",
    "            i,\n",
    "            mdl.ll_per_word,\n",
    "            perplexity_per_topic,\n",
    "            coherence_per_topic\n",
    "            )\n",
    "        )\n",
    "        coherence_values.append(coherence_per_topic) \n",
    "        perplexities.append(perplexity_per_topic)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "gEwF4_GJJpZO",
    "outputId": "1d64916e-43a1-45ef-b2db-60b6c6643a6e",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "limit = 45\n",
    "start = 15\n",
    "step = 5\n",
    "\n",
    "coherence_values = []\n",
    "perplexities = []\n",
    "\n",
    "compute_coherence_values(DF['All_out'], start, limit, step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pHab-HZOJpZQ",
    "outputId": "0b603b19-c4ab-4bce-f0e0-9847002094df",
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "x = range(start, limit, step)\n",
    "\n",
    "mdl_check = pd.DataFrame(list(zip(x, perplexities, coherence_values)), columns=['Num Topics', 'Perplexity', 'Coherence']) \n",
    "\n",
    "fig, ax = plt.subplots(figsize=(15,5), nrows=1, ncols=2) \n",
    "\n",
    "plt.subplots_adjust(left=0.125, \n",
    "                    bottom=0.1, \n",
    "                    right=0.9, \n",
    "                    top=0.9, \n",
    "                    wspace=0.3, \n",
    "                    hspace=0.5)\n",
    "\n",
    "pd.pivot_table(mdl_check, values=[\"Perplexity\", \"Coherence\"],index=\"Num Topics\")\\\n",
    "    .plot(kind='line', rot=90, ax=ax, subplots=True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mdl = tp.PTModel(k=35, seed=7777)\n",
    "mdl.burn_in = 100\n",
    "\n",
    "for text in DF['All_out']:\n",
    "    token = text.strip().split()\n",
    "    if token:\n",
    "        mdl.add_doc(token)\n",
    "mdl.train(iter=500, workers=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Table 1 Topic Keywords (Label is attached manually)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(mdl.k):\n",
    "    res = mdl.get_topic_words(i, top_n=20)\n",
    "    print('Topic #{}'.format(i), end='\\t')\n",
    "    print(', '.join(w for w, p in res))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_docs = [[] for _ in range(mdl.k)] \n",
    "\n",
    "for doc in mdl.docs:\n",
    "    top_docs[doc.get_topics(top_n=1)[0][0]].append(doc) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])\n",
    "doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])\n",
    "doc_lengths = np.array([len(doc.words) for doc in mdl.docs])\n",
    "vocab = list(mdl.used_vocabs)\n",
    "term_frequency = mdl.used_vocab_freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])\n",
    "df_topic = pd.DataFrame.from_records(doc_topic_dists) \n",
    "df_topic.columns = ['Topic 01', 'Topic 02', 'Topic 03', 'Topic 04', 'Topic 05', 'Topic 06', 'Topic 07', 'Topic 08', 'Topic 09', 'Topic 10',\n",
    "                    'Topic 11', 'Topic 12', 'Topic 13', 'Topic 14', 'Topic 15', 'Topic 16', 'Topic 17', 'Topic 18', 'Topic 19', 'Topic 20',\n",
    "                    'Topic 21', 'Topic 22', 'Topic 23', 'Topic 24', 'Topic 25', 'Topic 26', 'Topic 27', 'Topic 28', 'Topic 29', 'Topic 30',\n",
    "                    'Topic 31', 'Topic 32', 'Topic 33', 'Topic 34', 'Topic 35']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tm = pd.concat([DF['yymm'], df_topic], axis=1)\n",
    "df_tm.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "test = pd.read_excel('c:/pythonwork/result.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "colname=['Topic 01', 'Topic 02', 'Topic 03', 'Topic 04', 'Topic 05', 'Topic 06', 'Topic 07', 'Topic 08', 'Topic 09', 'Topic 10',\n",
    "         'Topic 11', 'Topic 12', 'Topic 13', 'Topic 14', 'Topic 15', 'Topic 16', 'Topic 17', 'Topic 18', 'Topic 19', 'Topic 20',\n",
    "         'Topic 21', 'Topic 22', 'Topic 23', 'Topic 24', 'Topic 25', 'Topic 26', 'Topic 27', 'Topic 28', 'Topic 29', 'Topic 30',\n",
    "         'Topic 31', 'Topic 32', 'Topic 33', 'Topic 34', 'Topic 35']\n",
    "TP_number = list(range(1,36))\n",
    "TP_name = ['Successful models', 'Patients and symptoms', 'Fatality in the US', 'Waves of infections', 'Testing, tracing, and treatment', 'Youth culture and pandemic',\n",
    "           'Government responses', 'Youth vaccination and Jeju island', 'Infections in US Forces Korea and the military', 'Death tolls in different countries',\n",
    "           'Economic crisis and recovery in the two Koreas', 'Use of technology', 'Outbreak and safety in cafe', 'Assessments of K-quarantine', 'Covid deaths in South Korea and US',\n",
    "           'Protest, oppression, and human rights', 'Authoritarian and developing countries', 'Church outbreaks', 'New spikes in Seoul and schools', 'Use of drugs',\n",
    "           'US-South Korea military exercise', 'Vaccination', 'Golf and golfing during pandemic', 'Election, party, and politics', 'Covid statistics and social distancing',\n",
    "           'Records in the Asia-Pacific region', 'Support and regulations', 'Maryland governor', 'Economic growth and market in Asia', 'Different vaccines', 'Motor companies',\n",
    "           'AI and fight against covid', 'Plasma donation from church members', 'Olympics and K-pop', 'Infections in East Asia']\n",
    "\n",
    "data = pd.pivot_table(test, values=colname, index=\"yymm\", aggfunc=\"mean\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 1. The average trend of ‘countries' responses and records’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 24), nrows=4, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [0, 2, 6, 9, 13, 14, 25, 34]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 2. The average trend of ‘infections, testing, tracing, and treatment, social distancing’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 12), nrows=2, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [3, 4, 18, 24]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 3. The average trend of ‘vaccines and vaccination’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 12), nrows=2, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [19, 21, 29]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "\n",
    "ax[1, 1].set_visible(False)\n",
    "plt.show()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 4. The average trend of ‘economic issues’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 12), nrows=2, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [10, 28]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "\n",
    "ax[1, 0].set_visible(False)        \n",
    "ax[1, 1].set_visible(False)\n",
    "plt.show()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 5. The average trend of ‘US camp and the military’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 12), nrows=2, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [8, 20]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "\n",
    "ax[1, 0].set_visible(False)        \n",
    "ax[1, 1].set_visible(False)\n",
    "plt.show()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 6. The average trend of ‘broadly human rights related’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 12), nrows=2, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [15, 17, 32]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "        \n",
    "ax[1, 1].set_visible(False)\n",
    "plt.show()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 7. The average trend of ‘youth and culture’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 18), nrows=3, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [5, 7, 12, 22, 26, 33]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "        \n",
    "plt.show()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure 8. The average trend of ‘technology’ topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(20, 12), nrows=2, ncols=2, sharey='col')\n",
    "row = 0; col = 0; cnt = 0\n",
    "\n",
    "plt.subplots_adjust(left=0.125,\n",
    "                bottom=0.2, \n",
    "                right=0.9, \n",
    "                top=0.9, \n",
    "                wspace=0.1, \n",
    "                hspace=0.3)\n",
    "\n",
    "custom_ylim = (0, 0.15)\n",
    "plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "for i in [11, 30, 31]:\n",
    "\n",
    "    lo = ax[row, col]\n",
    "    lo.plot(data.index, data[colname[i]], color = \"black\")\n",
    "    lo.set_xticklabels(data.index, rotation=45, ha='right', fontsize = 13)\n",
    "    lo.set_title((f'Topic {i+1}: {TP_name[i]}'), fontsize = 16, x=0.98, y=1.0, pad=-30, loc='right')\n",
    "                \n",
    "    if(col == 1): col = 0\n",
    "    else: col = 1\n",
    "    cnt += 1\n",
    "    if(cnt % 2 == 0): row += 1\n",
    "        \n",
    "ax[1, 1].set_visible(False)\n",
    "plt.show()\n",
    "fig.tight_layout()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "PTM_twitter+covid-19.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Environment (conda_python3)",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
